you need to have run the regular notebook before
from __future__ import print_function
import os.path
import pandas as pd
import sys
sys.path.insert(0, '../../')
import seaborn as sns
import numpy as np
from scipy.stats import spearmanr
from JKBio.Helper import *
from JKBio.helper import pyDESeq2
from bokeh.plotting import *
from bokeh.models import HoverTool
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
#from umap import UMAP
output_notebook()
%load_ext autoreload
%matplotlib inline
%autoreload 2
%load_ext rpy2.ipython
downloading their data
version="vAll"
project="slamseqMax"
location= '../data/'+project+"/"
%store -r res
%store -r tccounts
%store -r readcounts
%store -r tccountsMyci
%store -r tccountsMybi
%store -r tccountsMEF2D
%store -r designMS2
%store -r designJQ1
%store -r designMS2_JQ1
%store -r designMybi30
%store -r designMybi6
%store -r designMybi24
%store -r designMEF2D2
%store -r designMEF2D24
%store -r ctf
minvar_toremove=0
mincount_toremove=5
For gene-level analysis, raw reads mapped to different UTR annotations of the same gene were summed up by Entrez Gene ID. Pilot studies of K562 cells with kinase inhibitors were performed as single experiments.
Analysis of differential gene expression was restricted to genes with ≥ 10 reads in at least one condition for 50bp sequencing runs (flavopiridol and DMSO) or ≥ 20 reads in at least one condition for 100bp sequencing runs (mk2206, trametinib, nilotinib, trametinib + mk2206 and DMSO). For estimating differential expression, a pseudo-count of 1 raw read was added to all genes.
Differential gene expression calling was performed on raw read counts with ≥ 2 T>C conversions using DESeq2 (version 1.14.1) with default settings, and with size factors estimated on corresponding total mRNA reads for global normalization.
Downstream analysis was restricted to genes passing all internal filters for FDR estimation by DESeq2. Principal component analysis was performed after variance stabilizing transformation on the 500 most variable genes across all conditions of a given experiment. GO-term enrichment analysis was performed on genes significantly and strongly downregulated (FDR ≤ 0.1, log2FC ≤ -1) in SLAM-seq upon IAA-treatment in K562MYC-AID + Tir1 by the PANTHER Overrepresentation Test (Fisher's Exact with FDR multiple test correction, release 20171205, http://pantherdb.org) on GO Ontology database Released 2017-12-27.
scaling="ERCCsamplewise_TotalC"
readcountsMyci= readcounts[list(tccountsMyci.columns)]
MS2 = [1,1,1,0,0,0,0,0,0,1,1,1]
deseqMS2 = pyDESeq2.pyDESeq2(count_matrix=readcountsMyci[readcountsMyci.columns[np.array(MS2+[1],np.bool)]], design_matrix=designMS2[np.array(MS2,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
JQ1 = [0,0,0,1,1,1,0,0,0,1,1,1]
deseqJQ1 = pyDESeq2.pyDESeq2(count_matrix=readcountsMyci[readcountsMyci.columns[np.array(JQ1+[1],np.bool)]],
design_matrix=designJQ1[np.array(JQ1,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
MS2_JQ1 = [0,0,0,0,0,0,1,1,1,1,1,1]
deseqMS2_JQ1 = pyDESeq2.pyDESeq2(count_matrix=readcountsMyci[readcountsMyci.columns[np.array(MS2_JQ1+[1],np.bool)]], design_matrix=designMS2_JQ1[np.array(MS2_JQ1,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
readcountsMybi= readcounts[list(tccountsMybi.columns)]
Mybi30 = [1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
deseqMybi30 = pyDESeq2.pyDESeq2(count_matrix = readcountsMybi[readcountsMybi.columns[np.array(Mybi30+[1],np.bool)]],
design_matrix=designMybi30[np.array(Mybi30,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
Mybi6 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0]
deseqMybi6 = pyDESeq2.pyDESeq2(count_matrix = readcountsMybi[readcountsMybi.columns[np.array(Mybi6+[1],np.bool)]],
design_matrix=designMybi6[np.array(Mybi6,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
Mybi24 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1]
deseqMybi24 = pyDESeq2.pyDESeq2(count_matrix=readcountsMybi[readcountsMybi.columns[np.array(Mybi24+[1],np.bool)]], design_matrix=designMybi24[np.array(Mybi24,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
MEF2D2 = [1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
readcountsMEF2D= readcounts[list(tccountsMEF2D.columns)]
deseqMEF2D2 = pyDESeq2.pyDESeq2(count_matrix = readcountsMEF2D[readcountsMEF2D.columns[np.array(MEF2D2+[1],np.bool)]],
design_matrix=designMEF2D2[np.array(MEF2D2,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
MEF2D24 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
deseqMEF2D24 = pyDESeq2.pyDESeq2(count_matrix = readcountsMEF2D[readcountsMEF2D.columns[np.array(MEF2D24+[1],np.bool)]],
design_matrix=designMEF2D24[np.array(MEF2D24,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
deseqMS2.run_estimate_size_factors()
deseqJQ1.run_estimate_size_factors()
deseqMS2_JQ1.run_estimate_size_factors()
deseqMybi30.run_estimate_size_factors()
deseqMybi6.run_estimate_size_factors()
deseqMybi24.run_estimate_size_factors()
deseqMEF2D2.run_estimate_size_factors()
deseqMEF2D24.run_estimate_size_factors()
# from https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf FFROM THOUSANDS OF SAMPLES
housekeeping1 = ["C1orf43", "CHMP2A", "EMC7", "GPI", "PSMB2", "PSMB4", "RAB7A", "REEP5", "SNRPD3", "VCP", "VPS29"]
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4760967/ FOR CANCER CELL LINES
housekeeping2 = ['18S rRNA',
'ACTB',
'B2M',
'G6PD',
'GAPDH',
'GUSB',
'HMBS',
'HPRT1',
'PGK1',
'PPIA',
'RPL13a',
'SDHA',
'TBP',
'TUBB',
'YWHAZ']
housekeeping = readcounts.genes.isin(housekeeping2)
readcountsMybi= readcounts[readcounts.columns[16:-1]]
np.exp(np.mean(np.log(
readcountsMybi[readcountsMybi.columns[np.array([1,1,1,1,0,0,0], np.bool)]].values+1), 1))
readcountsMybi= readcounts[readcounts.columns[16:-1]]
deseqMybi.run_estimate_size_factors(controlGenes=housekeeping)
readcountsMEF2D= readcounts[readcounts.columns[:16]]
deseqMEF2D2.run_estimate_size_factors(controlGenes=housekeeping)
deseqMEF2D24.run_estimate_size_factors(controlGenes=housekeeping)
r
#### MYCi
sizeFact = deseqJQ1.getSizeFactors()
sizeFact
sizeFact[:3] = sizeFact[:3]* (res[[i for i in res.index if '-JQ1-' in i]].values/res[[i for i in res.index if '-DMSO-' in i]].values.mean())
sizeFact
deseqJQ1.setSizeFactors(sizeFact)
sizeFact = deseqMS2_JQ1.getSizeFactors()
sizeFact
sizeFact[:3] = sizeFact[:3]*(res[[i for i in res.index if '-MS2_JQ1-' in i]].values/res[[i for i in res.index if '-DMSO-' in i]].values.mean())
sizeFact
deseqMS2_JQ1.setSizeFactors(sizeFact)
#### MYBi
sizeFact = deseqMybi30.getSizeFactors()
sizeFact[:4] = sizeFact[:4]*(res[[i for i in res.index if '-MYBi_30m-' in i]].values/res[[i for i in res.index if '-PBS_30m-' in i]].values.mean())
deseqMybi30.setSizeFactors(sizeFact)
sizeFact = deseqMybi6.getSizeFactors()
sizeFact[:4] = sizeFact[:4]*(res[[i for i in res.index if '-MYBi_6h-' in i]].values/res[[i for i in res.index if '-PBS_6h-' in i]].values.mean())
deseqMybi6.setSizeFactors(sizeFact)
sizeFact = deseqMybi24.getSizeFactors()
sizeFact
sizeFact[4:] = sizeFact[4:]*(res[[i for i in res.index if '-MYBi_24h-' in i]].values/res[[i for i in res.index if '-PBS_24h-' in i]].values.mean())
deseqMybi24.setSizeFactors(sizeFact)
#### MEF2D
sizeFact = deseqMEF2D24.getSizeFactors()
sizeFact[4:] = sizeFact[4:]*(res[[i for i in res.index if '-VHL_24h-' in i]].values/res[[i for i in res.index if '-DMSO_24h-' in i]].values.mean())
deseqMEF2D24.setSizeFactors(sizeFact)
deseqMybi24.run_deseq()
deseqMybi24.get_deseq_result()
resMybi24 = deseqMybi24.deseq_result
resMybi24.pvalue = np.nan_to_num(np.array(resMybi24.pvalue), 1)
resMybi24.log2FoldChange = np.nan_to_num(np.array(resMybi24.log2FoldChange), 0)
resMybi24.log2FoldChange = -resMybi24.log2FoldChange
resMybi24["gene_id"] = resMybi24.genes
deseqMEF2D2.run_deseq()
deseqMEF2D24.run_deseq()
deseqMEF2D2.get_deseq_result()
deseqMEF2D24.get_deseq_result()
resMEF2D2 = deseqMEF2D2.deseq_result
resMEF2D24 = deseqMEF2D24.deseq_result
resMEF2D2.pvalue = np.nan_to_num(np.array(resMEF2D2.pvalue), 1)
resMEF2D2.log2FoldChange = np.nan_to_num(np.array(resMEF2D2.log2FoldChange), 0)
resMEF2D24.pvalue = np.nan_to_num(np.array(resMEF2D24.pvalue), 1)
resMEF2D24.log2FoldChange = np.nan_to_num(np.array(resMEF2D24.log2FoldChange), 0)
resMEF2D24.log2FoldChange = -resMEF2D24.log2FoldChange
resMEF2D2.log2FoldChange = -resMEF2D2.log2FoldChange
resMEF2D2["gene_id"] = resMEF2D2.genes
resMEF2D24["gene_id"] = resMEF2D24.genes
deseqMS2.run_deseq()
deseqJQ1.run_deseq()
deseqMS2_JQ1.run_deseq()
deseqMS2.get_deseq_result()
deseqJQ1.get_deseq_result()
deseqMS2_JQ1.get_deseq_result()
resMS2 = deseqMS2.deseq_result
resJQ1 = deseqJQ1.deseq_result
resMS2_JQ1 = deseqMS2_JQ1.deseq_result
resMS2.pvalue = np.nan_to_num(np.array(resMS2.pvalue), 1)
resMS2.log2FoldChange = np.nan_to_num(np.array(resMS2.log2FoldChange), 0)
resJQ1.pvalue = np.nan_to_num(np.array(resJQ1.pvalue), 1)
resJQ1.log2FoldChange = np.nan_to_num(np.array(resJQ1.log2FoldChange), 0)
resMS2_JQ1.pvalue = np.nan_to_num(np.array(resMS2_JQ1.pvalue), 1)
resMS2_JQ1.log2FoldChange = np.nan_to_num(np.array(resMS2_JQ1.log2FoldChange), 0)
resMS2.log2FoldChange = -resMS2.log2FoldChange
resJQ1.log2FoldChange = -resJQ1.log2FoldChange
resMS2_JQ1.log2FoldChange = -resMS2_JQ1.log2FoldChange
resMS2["gene_id"] = resMS2.genes
resJQ1["gene_id"] = resJQ1.genes
resMS2_JQ1["gene_id"] = resMS2_JQ1.genes
deseqMybi30.run_deseq()
deseqMybi6.run_deseq()
deseqMybi30.get_deseq_result()
deseqMybi6.get_deseq_result()
resMybi30 = deseqMybi30.deseq_result
resMybi6 = deseqMybi6.deseq_result
resMybi30.pvalue = np.nan_to_num(np.array(resMybi30.pvalue), 1)
resMybi30.log2FoldChange = np.nan_to_num(np.array(resMybi30.log2FoldChange), 0)
resMybi6.pvalue = np.nan_to_num(np.array(resMybi6.pvalue), 1)
resMybi6.log2FoldChange = np.nan_to_num(np.array(resMybi6.log2FoldChange), 0)
resMybi6.log2FoldChange = -resMybi6.log2FoldChange
resMybi30.log2FoldChange = -resMybi30.log2FoldChange
resMybi30["gene_id"] = resMybi30.genes
resMybi6["gene_id"] = resMybi6.genes
mix = pd.DataFrame()
mix["gene_id"] = resMybi30["gene_id"]
mix['Mybi 30mn'] = resMybi30.log2FoldChange
mix['Mybi 6h'] = resMybi6.log2FoldChange
scatter(mix[['Mybi 30mn','Mybi 6h']].values[:12000],
mix['gene_id'].values.tolist()[:12000], radi= 0.06, alpha=0.3,
colors = [0 if i in ctf else 1 for i in mix['gene_id'].values.tolist()[:12000]],
xname="Mybi 30mn",
yname="Mybi 6h",
folder='../results/'+project+"/plots/"+version+"_"+scaling+"_",
title='Mybi 30mn vs 6h differences in logFoldChange')
resMS2.to_csv("../results/"+project+"/"+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MS2_deseq.csv')
resJQ1.to_csv("../results/"+project+"/"+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)+'_JQ1_deseq.csv')
resMS2_JQ1.to_csv("../results/"+project+"/"+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MS2_JQ1_deseq.csv')
resMybi30.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_Mybi_30um_deseq.csv')
resMybi6.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_Mybi_6h_deseq.csv')
resMybi24.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_Mybi_24h_deseq.csv')
resMEF2D2.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MEF2D_2h_deseq.csv')
resMEF2D24.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MEF2D_24h_deseq.csv')
we can conclude that we get similar results to the slamseq myc paper although it seems that our values are a bit skewed toward higher expression than what is on the slamseq paper. It mightt be explained by the pseudo count of 1 that I did not set. Because I think it would highly bias the DESeq algorithm.
show(volcano(resMS2,tohighlight=ctf, searchbox=True, title='DESeq results of MV411 under MS2 in volcano plot', folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
show(volcano(resJQ1,tohighlight=ctf, searchbox=True, title='DESeq results of MV411 under JQ1 in volcano plot', folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
show(volcano(resMS2_JQ1,tohighlight=ctf, searchbox=True, title='DESeq results of MV411 under MS2 and JQ1 in volcano plot', folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
show(volcano(resMybi30,tohighlight=ctf, searchbox=True, title="Mybi at 30mn", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
show(volcano(resMybi6,tohighlight=ctf, searchbox=True, title="Mybi at 6h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
show(volcano(resMybi24,tohighlight=ctf, searchbox=True, title="Mybi at 24h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
show(volcano(resMEF2D2,tohighlight=ctf, searchbox=True, title="MEF2D at 2h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
show(volcano(resMEF2D24,tohighlight=ctf, searchbox=True, title="MEF2D at 24h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
resMEF2D24 = resMEF2D24.set_index('gene_id')
resMEF2D2 = resMEF2D2.set_index('gene_id')
totresMEF2D24 = resMEF2D24.copy()
totresMEF2D2 = resMEF2D2.copy()
%store -r resMEF2D24
%store -r resMEF2D2
resMEF2D24 = resMEF2D24.set_index('gene_id')
resMEF2D2 = resMEF2D2.set_index('gene_id')
# The correlation is much high in totcounts than read counts
# only the MEF2D @2h tc ccounts and MEF2D @24h totcounts have an high overlap in genes than pass padj of 0.1
set(tcMEF2D2.index) & set(tcMEF2D24.index)
set(tcMEF2D2.index) & set(MEF2D24.index)
set(tcMEF2D2.index) & set(MEF2D2.index)
set(MEF2D2.index) & set(tcMEF2D24.index)
# by removing genes with low pvalue we increase the correlation by a lot!
a = scatter(np.vstack([totresMEF2D24.loc[sim].log2FoldChange.values, resMEF2D2.loc[sim].log2FoldChange.values]).T,
totresMEF2D24.loc[sim].index.tolist(), radi= 0.06, alpha=0.3,
colors = [0 if i in ctf else 1 for i in totresMEF2D24.loc[sim].index.tolist()],
xname="total counts MEF2D @24h",
yname="tc counts MEF2D @2h",
folder='../results/'+project+"/plots/"+version+"_"+scaling+"_",
title='MEF2D tccounts 2h vs readcounts 24h differences in logFoldChange')
def getSimilarity(exp1, exp2, ptype = "pvalue", doplot=True): #could be padj
counts = []
correlations = []
pvalue = []
for val in np.exp(np.invert(list(range(-1,20)))):
expA = exp1[exp1[ptype]<val].log2FoldChange
expB = exp2[exp2[ptype]<val].log2FoldChange
expA = expA[expA!=0.]
expB = expB[expB!=0.]
simigenes = set(expA.index) & set(expB.index)
counts.append(len(simigenes))
if len(simigenes) < 2:
a = 0
b = 0
else:
a,b = spearmanr(expA.loc[simigenes], expB.loc[simigenes])
pvalue.append(b)
correlations.append(a)
ran = np.exp(np.invert(list(range(-1,20))))
if doplot:
fig, ax1 = plt.subplots()
ax1.set_xscale('log')
ax2 = ax1.twinx()
ax1.plot(ran, np.array(counts)+1, 'o-', color="blue")
ax1.set_yscale('log')
ax2.plot(ran, correlations, 'o-', color="red" )
ax1.set_ylabel('overlaping genes', color='b')
ax2.set_ylabel('correlation', color='r')
plt.show()
return counts, pvalue, correlations, ran
counts, pvalue, correlations, ran = getSimilarity(resMEF2D24, totresMEF2D24)
counts, pvalue, correlations, ran = getSimilarity(resMEF2D2, totresMEF2D24)
counts, pvalue, correlations, ran = getSimilarity(resMEF2D2, totresMEF2D2)
counts, pvalue, correlations, ran = getSimilarity(totresMEF2D2, totresMEF2D24)
counts, pvalue, correlations, ran = getSimilarity(resMEF2D2, resMEF2D24)
tcMEF2D2 = resMEF2D2[resMEF2D2.pvalue<0.05].log2FoldChange
MEF2D24 = totresMEF2D24[totresMEF2D24.pvalue<0.05].log2FoldChange
sim = set(tcMEF2D2.index) & set(MEF2D24.index)
sim & set(ctf)
tcMEF2D2 = resMEF2D2[resMEF2D2.padj<0.1].log2FoldChange
MEF2D24 = totresMEF2D24[totresMEF2D24.padj<0.1].log2FoldChange
sim = set(tcMEF2D2.index) & set(MEF2D24.index)
sim & set(ctf)
resMybi24 = resMybi24.set_index('gene_id')
resMybi6 = resMybi6.set_index('gene_id')
resMybi30 = resMybi30.set_index('gene_id')
totresMybi24 = resMybi24.copy()
totresMybi6 = resMybi6 .copy()
totresMybi30 = resMybi30.copy()
%store -r resMybi24
%store -r resMybi6
%store -r resMybi30
resMybi24 = resMybi24.set_index('gene_id')
resMybi6 = resMybi6.set_index('gene_id')
resMybi30 = resMybi30.set_index('gene_id')
counts, pvalue, correlations, ran = getSimilarity(resMybi30, resMybi6)
counts, pvalue, correlations, ran = getSimilarity(resMybi30, totresMybi30)
counts, pvalue, correlations, ran = getSimilarity(resMybi30, totresMybi6)
We can really see that although we can be fooled by the correlation at some definite moments, its patter of change according to the threshold applied provides interesting results.
Once including things with only a pval below 10^-6 the correlation in stronger at 30mn-tot vs 30mn-tc than 6h-tc vs 30mn-tc. Same thing for 6h-tot vs 30mn-tc, But here we keep even more similar genes than for the other examples!
counts, pvalue, correlations, ran = getSimilarity(resMybi30, totresMybi30, "padj")
counts, pvalue, correlations, ran = getSimilarity(resMybi30, totresMybi6, "padj")
Same thing using the adjusted pvalue, we see more overlapping genes in 30mn-tc vs 6h-tot than for others.
counts, pvalue, correlations, ran = getSimilarity(resMybi6, totresMybi6, "padj")
counts, pvalue, correlations, ran = getSimilarity(resMybi6, totresMybi6)
Even if the pvalue increase when looking at the 6h-tc vs 6h-tot, it is also due to the fact that they are the same sequencing run, the same experiment and it contains less genes.
counts, pvalue, correlations, ran = getSimilarity(resMybi6, totresMybi24)
counts, pvalue, correlations, ran = getSimilarity(resMybi24, totresMybi24)
counts, pvalue, correlations, ran = getSimilarity(totresMybi6, totresMybi24)
counts, pvalue, correlations, ran = getSimilarity(resMybi, resMybi24)
and we can see that things start to diverge at 24hs, whether it is tc counts or total counts. This can be seen by how widely different these replicates were from themselves and from the rest.
This tend to show how slamseq is almost predicting for a subset of the most confident genes on DESeq, their value in 6hours from now in totalcounts.